knitr::opts_chunk$set(echo = TRUE)
#Setting Working Directory
knitr::opts_knit$set(root.dir = "~/Desktop/Final Project/")
getwd()
## [1] "/Users/christian3/Desktop/Final Project"
#Install and Intialize Packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.1
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggthemes)
library(gifski)
library(gganimate)
library(png)
library(extrafont)
## Registering fonts with R
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(vars)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: strucchange
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
## Loading required package: urca
## Loading required package: lmtest
##
## Attaching package: 'vars'
##
## The following object is masked from 'package:tidyquant':
##
## VAR
library(sarima)
## Loading required package: stats4
##
## Attaching package: 'sarima'
##
## The following object is masked from 'package:stats':
##
## spectrum
library(lmtest)
#Read CSV
setwd("~/Desktop/Final Project/")
NYC <-read.csv("NYC_Property_Sales_Data.csv")
NYC
#Summary
summary(NYC)
## X BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY
## Min. : 4 Min. :1.000 Length:84548 Length:84548
## 1st Qu.: 4231 1st Qu.:2.000 Class :character Class :character
## Median : 8942 Median :3.000 Mode :character Mode :character
## Mean :10344 Mean :2.999
## 3rd Qu.:15987 3rd Qu.:4.000
## Max. :26739 Max. :5.000
## TAX.CLASS.AT.PRESENT BLOCK LOT EASE.MENT
## Length:84548 Min. : 1 Min. : 1.0 Mode:logical
## Class :character 1st Qu.: 1323 1st Qu.: 22.0 NA's:84548
## Mode :character Median : 3311 Median : 50.0
## Mean : 4237 Mean : 376.2
## 3rd Qu.: 6281 3rd Qu.:1001.0
## Max. :16322 Max. :9106.0
## BUILDING.CLASS.AT.PRESENT ADDRESS APARTMENT.NUMBER
## Length:84548 Length:84548 Length:84548
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS
## Min. : 0 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.:10305 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 1.000
## Median :11209 Median : 1.000 Median : 0.0000 Median : 1.000
## Mean :10732 Mean : 2.025 Mean : 0.1936 Mean : 2.249
## 3rd Qu.:11357 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.: 2.000
## Max. :11694 Max. :1844.000 Max. :2261.0000 Max. :2261.000
## LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE
## Length:84548 Length:84548 Min. : 0 Min. :1.000
## Class :character Class :character 1st Qu.:1920 1st Qu.:1.000
## Mode :character Mode :character Median :1940 Median :2.000
## Mean :1789 Mean :1.657
## 3rd Qu.:1965 3rd Qu.:2.000
## Max. :2017 Max. :4.000
## BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE
## Length:84548 Length:84548 Length:84548
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
#Cleaning
NYC <- NYC %>%
mutate(SALE.PRICE=as.numeric(SALE.PRICE))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SALE.PRICE = as.numeric(SALE.PRICE)`.
## Caused by warning:
## ! NAs introduced by coercion
NYC <- NYC%>%
mutate(SALE.DATE=as.Date(SALE.DATE),'%m/%d/%Y')
NYC<- NYC %>%
dplyr::select("BOROUGH","SALE.PRICE","SALE.DATE")
NYC<- NYC %>%
drop_na()
filter(NYC, SALE.PRICE > 0)
NYC$BOROUGH = case_when(
NYC$BOROUGH == "1" ~ "Manhatten",
NYC$BOROUGH == "2" ~ "Bronx",
NYC$BOROUGH == "3" ~ "Brooklyn",
NYC$BOROUGH == "4" ~ "Queens",
NYC$BOROUGH == "5" ~ "Staten Island",
)
NYC <- NYC %>%
group_by(month = lubridate::floor_date(SALE.DATE, 'month'), BOROUGH) %>%
summarize(
average_price = mean(SALE.PRICE))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
NYC <- NYC %>%
rename("borough" = "BOROUGH")
NYC
summary(NYC)
## month borough average_price
## Min. :2016-09-01 Length:60 Min. : 423580
## 1st Qu.:2016-11-23 Class :character 1st Qu.: 565888
## Median :2017-02-15 Mode :character Median : 721588
## Mean :2017-02-14 Mean :1203620
## 3rd Qu.:2017-05-08 3rd Qu.: 919570
## Max. :2017-08-01 Max. :4709871
#Simple Plot
ggplot(NYC, aes(x=month, y=average_price, color=borough))+
scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_x_date(date_labels="%b %Y",date_breaks = "1 month")+
geom_line()+
labs(title="NYC Average Property Sale Price")
#FacetWrap For Each Borough
ggplot(NYC, aes(x=month, y=average_price, color=borough))+
scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels = label_dollar()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_x_date(date_labels="%b %Y",date_breaks = "1 month")+
geom_line()+
labs(title="NYC Average Property Sale Price")+
facet_wrap(~borough)
#Plotting Graph + Animation
video = NYC %>%
ggplot(aes(x=month, y=average_price, color=borough)) +
geom_line(linewidth=2, alpha=0.75) + #thick lines
theme_solarized_2(light=FALSE) + # Change style of lighting
labs(title= "NYC Real Estate Sales 2016-2017",
y="Sale Price ($ USD)") +
theme(text=element_text(family="Courier", colour="#EEEEEE"),
title=element_text(color= "#EEEEEE"),
axis.title.x = element_blank(), # getting rid of x axis label
panel.background=element_rect(fill=NA), # getting rid of grid lines
plot.background=element_rect(fill="#111111"), # replace grid lines with darker tones
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(), #eliminating grid lines again
legend.background=element_blank(), # getting rid of legend background
legend.key=element_blank(), # getting rid of legend key
legend.position = "bottom", # moving legend to bottom
plot.title= element_text(hjust=0.5),
axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_colour_brewer(palette = "BuGn") +
geom_point()+ # set points on lines
scale_x_date(date_labels="%b '%y",date_breaks = "1 month")+
scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar())
video
#Animate
video.animation = video +
transition_reveal(month) +
view_follow(fixed_y=TRUE) #set y axis as fixed by move x axis
animate(video.animation,height=500, width=800,fps=30,duration=10,end=60,res=100)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
#TimeSeries - Cleaning Data
setwd("~/Desktop/Final Project/")
NYC <-read.csv("NYC_Property_Sales_Data.csv")
NYC <- NYC %>%
mutate(SALE.PRICE=as.numeric(SALE.PRICE))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SALE.PRICE = as.numeric(SALE.PRICE)`.
## Caused by warning:
## ! NAs introduced by coercion
NYC <- NYC%>%
mutate(SALE.DATE=as.Date(SALE.DATE),'%m/%d/%Y')
NYC<- NYC %>%
dplyr::select("BOROUGH","SALE.PRICE","SALE.DATE")
NYC<- NYC %>%
drop_na()
filter(NYC, SALE.PRICE > 0)
NYC <- NYC %>%
group_by(month = lubridate::floor_date(SALE.DATE, 'month'), BOROUGH) %>%
summarize(
average_price = mean(SALE.PRICE))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
NYC = subset(NYC, NYC$BOROUGH < 2)
summary(NYC)
## month BOROUGH average_price
## Min. :2016-09-01 Min. :1 Min. :2561120
## 1st Qu.:2016-11-23 1st Qu.:1 1st Qu.:2771101
## Median :2017-02-15 Median :1 Median :3017874
## Mean :2017-02-14 Mean :1 Mean :3320867
## 3rd Qu.:2017-05-08 3rd Qu.:1 3rd Qu.:3764480
## Max. :2017-08-01 Max. :1 Max. :4709871
#Creating 2 Different Data Frames
train <- NYC%>% filter(month < "2017-03-01")
hold_out <- NYC%>% filter(month >= "2017-03-01")
#Graphing Data Frame Before Prediction
ggplot(train) +
geom_line(aes(month, average_price)) +
labs(title = "Average Real Estate Price in Manhatten")
#ARIMA Modeling
arima_model <- arima(train$average_price, c(2, 1, 0), method = "ML")
arima_model
##
## Call:
## arima(x = train$average_price, order = c(2, 1, 0), method = "ML")
##
## Coefficients:
## ar1 ar2
## 0.4948 -0.8577
## s.e. 0.3150 0.1609
##
## sigma^2 estimated as 1.83e+11: log likelihood = -73.29, aic = 152.59
coeftest(arima_model)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.49475 0.31502 1.5705 0.1163
## ar2 -0.85768 0.16090 -5.3305 9.793e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#AR Performance
arima_prediction <- predict(arima_model, n.ahead = 12)
arima_prediction
## $pred
## Time Series:
## Start = 7
## End = 18
## Frequency = 1
## [1] 2310117 3494560 4295849 3676415 2682695 2722326 3594230 3991616 3440409
## [10] 2826866 2996073 3606015
##
## $se
## Time Series:
## Start = 7
## End = 18
## Frequency = 1
## [1] 427757.9 769285.5 856793.4 859331.1 870163.4 974574.8 1108175.0
## [8] 1151346.3 1158646.1 1180124.8 1255092.1 1332289.3
#Plotting our ARIMA Prediction
ggplot(cbind(hold_out[1:12,c("month", "average_price")], as.data.frame(arima_prediction)), aes(x = month)) +
geom_ribbon(aes(ymin = pred - se, ymax = pred + se), alpha = 0.25, fill = scales::muted("green")) +
geom_line(aes(y = pred), lty = 2) +
geom_line(aes(y = average_price)) +
scale_y_continuous(breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar()) +
labs(title = "ARIMA prediction of Average Real Estate Price in Manhatten", subtitle = "Actual = solid, prediciton = dashed, se = green")
## Warning: Removed 6 rows containing missing values (`geom_line()`).
## Removed 6 rows containing missing values (`geom_line()`).